############################################
#
# Marcus Bischof
# Divvy EDA : Chicago
#
############################################
# Operations
import pandas as pd
import numpy as np
# Image libs
from PIL import Image, ImageChops
from folium.raster_layers import ImageOverlay
# Custom functions
from functions_for_eda import *
# Data viz
from matplotlib import pyplot as plt
import seaborn as sns
# Maps
import folium
from folium import plugins
# Jupyter display: raise pandas' row and column display limits to 500.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

# Do we need to load raw .csv, and create a single memory efficient .pkl?
# (One-off preprocessing step; leave False once the .pkl exists.)
CREATE_SMALL_MEMORY_SET = False
if CREATE_SMALL_MEMORY_SET:
    create_memory_efficient_pkl()

# Do we want to break up the 860+mb memory efficient .pkl into 10 slices?
# (One-off preprocessing step; leave False once the slices exist.)
CREATE_SLICES_OF_MEMORY_EFFICIENT_PKL = False
if CREATE_SLICES_OF_MEMORY_EFFICIENT_PKL:
    create_slices_of_memory_efficient_pkl()

# One tenth of the divvy data, to be used for exploration.
df = pd.read_pickle('../data/interim/df_0_1000000.pkl')

# Neighborhoods and geo.
n_hood = load_geojson_neighborhood_data()
stations = pd.read_pickle('../data/processed/stations.pkl')
To create a map.
create_chicago_map()
To add points to a map. **Note:** icon must be a font-awesome icon.
add_points_to_map
(
folium_map_obj, color, icon, points
)
To add a neighborhood overlay.
add_neighborhood_overlay_to_map
(
folium_map_obj, neighborhood_name, color, n_hood_df_polylines
)
To trim an image such that we can add it to a folium map element (I did not write this).
trim(image_path)
# Heatmap of station locations drawn on a fresh Chicago base map.
# Thanks to https://alysivji.github.io/getting-started-with-folium.html
m = create_chicago_map()

# One (lat, long) row per station, as a plain array for the HeatMap plugin.
stations_starts = stations[['lat', 'long']].values

station_heat_layer = plugins.HeatMap(stations_starts, radius=10)
m.add_child(station_heat_layer)

# Bare expression so the notebook renders the map.
m
We see a significant concentration of divvy stations in:
- The loop
- Northern neighborhoods on the lake like Lincoln Park
For our analysis, let's first understand the data broadly.
We will then start with a neighborhood-centric approach to analyzing the data. Since residents of a neighborhood may share certain commonalities, I believe we may see interesting trends within and between neighborhoods.
# Quick look at the raw columns.
df.head()

# Distribution of trip duration per month, split by user type.
violin_spec = dict(x="month", y="tripduration", hue="usertype", kind="violin")
g = sns.catplot(data=df, **violin_spec)

# Same comparison as a table: mean trip duration per (month, usertype),
# dropping combinations that have no data.
by_month_and_usertype = df[['month', 'usertype', 'tripduration']].groupby(
    ['month', 'usertype']
)
by_month_and_usertype.mean().dropna(how='any')
It seems as though we are only getting customer data from July.
How much does trip duration vary across event types?
# Distribution of trip duration for each recorded weather event type.
g = sns.catplot(x="events", y="tripduration", data=df, kind="violin")
plt.title('Do trip durations vary widely across weather events?')

# Average trip duration at each recorded temperature.
# Select 'tripduration' BEFORE aggregating: calling .mean() on the whole
# grouped frame averages every column, which is wasted work and raises
# TypeError on pandas >= 2.0 when the frame holds non-numeric columns.
avg_duration_by_temp = (
    df.groupby('temperature')['tripduration'].mean().reset_index()
)
g = sns.lineplot(x='temperature', y='tripduration', data=avg_duration_by_temp)
plt.title('Looking at our range of recorded temps, what is the average trip duration per temp?')
As suspected, tripduration goes up as temperature goes up. I suspect this is mostly due to the fact that people enjoy taking longer bike trips in nice weather.
Let's quickly confirm that this temperature data makes sense.
# Build a sortable 'YYYYMM' key from starttime.
# starttime is handled as a string: take the date part before the space, then
# join its first two '-'-separated fields (year and month). Vectorised .str
# operations replace a row-wise apply that split each string four times.
start_date_parts = df.starttime.str.split(' ').str[0].str.split('-')
df['ym'] = start_date_parts.str[0] + start_date_parts.str[1]

# Average temperature per year-month, in chronological order.
# Select 'temperature' BEFORE aggregating: .mean() over the whole grouped
# frame would try to average string columns such as starttime, which raises
# TypeError on pandas >= 2.0.
avg_temp_by_month = (
    df.groupby('ym')['temperature'].mean().reset_index().sort_values(by='ym')
)
g = sns.lineplot(x='ym', y='temperature', data=avg_temp_by_month)
plt.title('Look through our date range and confirm average temps are valid.')
# Trip count and mean trip duration per origin neighborhood, longest mean
# duration first. The result has two-level column labels such as
# ('tripduration', 'mean') because of the multi-function agg.
duration_by_origin = df[['from_neighborhood', 'tripduration']].groupby(
    ['from_neighborhood']
)
duration_stats = duration_by_origin.agg(['count', 'mean'])
duration_stats = duration_stats.sort_values(
    [('tripduration', 'mean')], ascending=False
)
trip_durations_by_hood = duration_stats.reset_index()
trip_durations_by_hood.head()
Let's add the top 5 neighborhoods (by average tripduration) to the map. Anything in common here?
# Outline the five neighborhoods named above in red on the existing map `m`.
# NOTE(review): the names are hard-coded, presumably copied from the head()
# output above; re-check them if the underlying data slice changes.
top_hoods_by_avg_duration = [
    'Edgewater',
    'Museum Campus',
    'Little Village',
    'Douglas',
    'Gold Coast',
]
for top_hood in top_hoods_by_avg_duration:
    add_neighborhood_overlay_to_map(m, top_hood, 'red', n_hood)

# Bare expression so the notebook renders the updated map.
m
I am certainly getting the impression that the top neighborhoods in terms of average trip duration are actually neighborhoods with a small number of stations. This makes sense.
We will do the following:
- The top neighborhoods (1 --> 13): yellow
- The middle neighborhoods (14 --> 26): green
- The bottom neighborhoods (27 --> 39): blue
# Colour each neighborhood by its rank tercile of mean trip duration:
#   top third (longest mean duration) -> yellow
#   middle third                      -> green
#   bottom third (shortest)           -> blue
#
# The previous pd.cut call did not do this: pd.cut makes three equal-WIDTH
# bins over the mean values (not equal-count groups), and it labelled the
# lowest-valued bin "yellow". Ranking first (1 = longest mean duration) and
# splitting the ranks with pd.qcut gives equal-sized groups in the intended
# order. method='first' keeps ranks unique so the quantile edges never collide.
duration_rank = trip_durations_by_hood[('tripduration', 'mean')].rank(
    method='first', ascending=False
)
trip_durations_by_hood['color'] = pd.qcut(
    duration_rank.values, 3, labels=["yellow", "green", "blue"]
)

neighborhood_map = create_chicago_map()

# Columns are (from_neighborhood, count, mean, color); the frame has two-level
# column labels, so itertuples yields positional fields rather than named ones.
for hood_name, _trip_count, _mean_duration, hood_color in (
    trip_durations_by_hood.itertuples(index=False)
):
    add_neighborhood_overlay_to_map(neighborhood_map, hood_name, hood_color, n_hood)

# Legend image; the four numbers are passed through as the image's lat/long
# placement. NOTE(review): confirm the legend artwork matches the
# yellow/green/blue tercile mapping above.
add_image_to_map(
    neighborhood_map, 'avg_trip_duration.png', 41.954883, -87.594551,
    41.894883, -87.494551
)
neighborhood_map
# Mean trip duration for each week number, shown as a bar chart.
weekly_avg_duration = df[['week', 'tripduration']].groupby('week').mean()
weekly_avg_duration.plot.bar(title="Avg. Trip Duration per Week")
Confirm below that week one corresponds to the first week of January, as expected. The chart above makes sense now, as we would expect the average duration of trips to be shorter in the winter and longer in the summer.
# Show a few week-1 rows so their start dates can be checked by eye.
is_first_week = df.week == 1
df[is_first_week].head()
We want to understand how capacity can potentially be analyzed and predicted.
Let's first understand how neighborhoods differ when it comes to the percentage of trips that end at a different neighborhood vs. trips that end in the same neighborhood. Likewise, let's examine the same statistic for station to station trips.
# For every origin neighborhood, compute the share of its trips that end in a
# DIFFERENT neighborhood, as a list of (neighborhood, ratio) tuples sorted by
# ascending ratio.
#
# A single groupby replaces the old loop, which ran two full-frame boolean
# scans per neighborhood and raised ZeroDivisionError when unique() yielded
# NaN (a NaN key matches no rows). groupby drops NaN keys by default.
# Casting to object keeps the element-wise comparison valid even if the two
# columns are stored as categoricals with different category sets.
origin_hood = df.from_neighborhood.astype(object)
destination_hood = df.to_neighborhood.astype(object)

# A missing destination counts as "different", exactly as `to != n` did before.
ended_elsewhere = destination_hood != origin_hood
share_ending_elsewhere = ended_elsewhere.groupby(origin_hood).mean()

n_hood_different_neighborhood_ratios = sorted(
    ((n, float(ratio)) for n, ratio in share_ending_elsewhere.items()),
    key=lambda tup: tup[1],
)

neighborhood_diff_trip_end_density = create_chicago_map()
for n, density in n_hood_different_neighborhood_ratios:
    # The ratio (0..1) is passed as the last argument of the fill helper;
    # presumably it controls fill opacity - confirm in functions_for_eda.
    add_neighborhood_overlay_to_map_with_fill(
        neighborhood_diff_trip_end_density, n, 'yellow', n_hood, density
    )

# Legend image; the four numbers are passed through as its lat/long placement.
add_image_to_map(
    neighborhood_diff_trip_end_density, 'same_neighborhood_density.png', 41.954883, -87.594551,
    41.894883, -87.394551
)
neighborhood_diff_trip_end_density
df.head()

# Let's validate that many trips take place between typical work hours as hypothesized above.
# Counting the 'week' column per start hour gives the number of trips that
# started in that hour; the column is then relabelled 'count' for plotting.
trips_per_hour = (
    df[['week', 'hour']]
    .groupby(['hour'])
    .count()
    .reset_index()
    .rename(columns={"week": "count"})
)
trips_per_hour.plot.bar(
    x='hour',
    y='count',
    title='Amount of trips that start during this hour (military hours)',
)
The bimodal distribution above, with peaks at __8am__ and __5pm__, suggests that my hypothesis regarding work trips is at least partially correct.
df.head()

# Trips starting in the two peak hours (8 and 17), counted per
# (origin neighborhood, hour) and ordered from fewest to most trips.
peak_hours = [8, 17]
peak_hour_trips = df[df.hour.isin(peak_hours)]
peak_counts = peak_hour_trips.groupby(['from_neighborhood', 'hour']).count()
peak_counts = peak_counts.sort_values(by='trip_id', ascending=True).reset_index()

# Keep only the grouping keys plus the trip_id count, relabelled 'count'.
tmp = peak_counts[['from_neighborhood', 'hour', 'trip_id']]
tmp.columns = ['from_neighborhood', 'hour', 'count']

ax = sns.barplot(x="from_neighborhood", y="count", hue="hour", data=tmp)
plt.xticks(rotation='vertical')
ax.set_title('Where do trips START: 8am vs. 5pm?')
Notice: The major differences in trips originating in: The Loop, River North, Streeterville!
(For those unfamiliar with Chicago: these three neighborhoods are "downtown". A lot of people work in them, and they are not very residential.)
Let's examine capacity as it relates to various stations.
df.head()


def _plot_monthly_same_hood_counts(trips, title):
    """Plot monthly trip counts for `trips`, split by same_neighborhood_trip.

    Args:
        trips: Subset of the trips frame; must contain the columns
            'dpcapacity_start', 'ym' and 'same_neighborhood_trip'.
        title: Title placed on the resulting axes.

    Draws one line per same_neighborhood_trip value and shows the figure.
    """
    # Counting 'dpcapacity_start' per group gives the number of trips (rows
    # with a non-null value) in each (year-month, same_neighborhood_trip) cell.
    monthly_counts = (
        trips[['dpcapacity_start', 'ym', 'same_neighborhood_trip']]
        .groupby(['ym', 'same_neighborhood_trip'])
        .count()
        .reset_index()
        .sort_values(by='ym', ascending=True)
    )
    monthly_counts.columns = ['year_month', 'same_hood', 'count']
    ax = sns.lineplot(x="year_month", y="count", hue="same_hood", data=monthly_counts)
    ax.set_title(title)
    plt.show()


# One chart per station that appears as a trip origin in Wicker Park
# (e.g. Ashland Ave & Division St), titled with the station name.
wicker_park_stations = df[df.from_neighborhood == 'Wicker Park'].from_station_name.unique()
for station in wicker_park_stations:
    _plot_monthly_same_hood_counts(df[df.from_station_name == station], station)

# Same chart aggregated per origin neighborhood. Missing neighborhood values
# are skipped: they match no rows and would only produce an empty plot.
for n in df.from_neighborhood.dropna().unique():
    _plot_monthly_same_hood_counts(
        df[df.from_neighborhood == n],
        '{}: Trips (count) ending in same neighborhoods vs. different neighborhoods'.format(n),
    )
# Box plots of trip duration for same- vs. different-neighborhood trips,
# with one facet (column) per gender.
g = sns.catplot(
    x="same_neighborhood_trip", y="tripduration", col="gender",
    data=df, kind="box"
)
# catplot with col= returns a multi-facet grid. plt.title() would only replace
# the title of the last facet (hiding its "gender = ..." label), so the
# overall title is set on the figure instead. y > 1 lifts it clear of the
# per-facet titles.
g.fig.suptitle(
    'Examine trip duration as it relates to gender, does this looking different for same vs. diff neighborhood trips?\n\n',
    y=1.05,
)
Girls are representing and biking longer! (The 75th percentile, i.e. the top of the IQR box, is higher.)
How far apart are the capacities of the stations where trips start and where they end?
# Per-trip capacity difference: dpcapacity at the start station minus
# dpcapacity at the end station (both cast to float first).
for capacity_column in ('dpcapacity_start', 'dpcapacity_end'):
    df[capacity_column] = df[capacity_column].astype('float')
df['capacity_diff'] = df['dpcapacity_start'] - df['dpcapacity_end']

# Juicy information ...
# Mean capacity difference of all trips leaving each neighborhood.
capacity_diff_by_origin = df[['from_neighborhood', 'capacity_diff']].groupby(
    ['from_neighborhood']
)
inflow_outflow_df = capacity_diff_by_origin.mean().reset_index()

# Extremes used to scale each neighborhood's value into a 0..1 fill strength.
mean_capacity_diffs = inflow_outflow_df['capacity_diff']
outflow_max = mean_capacity_diffs.max()
inflow_min = mean_capacity_diffs.min()

inflow_outflow_map = create_chicago_map()
for _, n, density in inflow_outflow_df.itertuples():
    # Negative mean (start capacity below end capacity) -> red, scaled by the
    # most negative neighborhood; otherwise green, scaled by the most positive.
    start_is_smaller = density < 0
    c = 'red' if start_is_smaller else 'green'
    d = abs(density) / abs(inflow_min) if start_is_smaller else density / outflow_max
    add_neighborhood_overlay_to_map_with_fill(inflow_outflow_map, n, c, n_hood, d)

# Legend image; the four numbers are passed through as its lat/long placement.
add_image_to_map(
    inflow_outflow_map, 'inflow_outflow_rack_capacity.png', 41.954883, -87.594551,
    41.894883, -87.394551
)
inflow_outflow_map
Above is my favorite map.
Context
1) Red indicates that, on average, trips originating in a given neighborhood went from a station with dpcapacity LESS THAN that of the station where they ended. Likewise, Green indicates that, on average, trips originating in a given neighborhood went from a station with dpcapacity MORE THAN that of the station where they ended.
2) Essentially, this helps us understand the flow of traffic. What you are seeing is red neighborhoods MOSTLY flowing into green neighborhoods, and vice versa. These flows could aid in any effort to add new capacity OR rotate existing capacity. It makes sense that the residential areas tend to be red, while the working areas tend to be green.